* Set current directory and open log
* NOTE(review): the working directory is a hard-coded absolute path on drive E:.
* All relative paths used in this do-file ("Data management\...") resolve against it,
* so the path has to be adapted before the do-file can run on another machine.
cd "E:\Seagate Sync\VOL\Personal folder\Documents\Research\Current projects\Working party meetings\Posted\EUP 160404"
* Close a log that may still be open; capture suppresses the error if none is open
capture log close
* Open a plain-text log, overwriting the log of any previous run
log using "Data management\wpar-management06-workingparties03-merge", replace text


*****************************************************************************
* Reshaping of coded meeting data sets and merging with group population data
*****************************************************************************


* Project and author information
********************************
* Programme:	wpar-management06-workingparties03-merge.do
* Date:			4 April 2016
* Author:		Frank Haege, University of Limerick
* Contact:		frank.haege@ul.ie 

* Do-file description
*********************
* This do-file first identifies and drops all duplicate observations that refer to the 
* same working party meeting. Subsequently, it reshapes the dataset to generate additional
* observations for each working party taking part in a conjoint meeting (empirical maximum is three
* groups). From this reshaped dataset, new duplicates that can now be identified are also removed.
* Finally, the script drops the shorter of two meetings by the same working party during the
* same time and day (because they are partial duplicates).


* Reset the session
*******************
* Run everything below under Stata 12 syntax
version 12.0
* Never pause the output at a --more-- prompt
set more off
* Empty memory and remove all macros left over from earlier work
clear all
macro drop _all

* Read the coded meeting data and show a short summary of the dataset
use "Data management\wpar-management05-workingparties02-code", clear
describe, short


****************************************
* Remove duplicates from meeting dataset
****************************************

* Tabulate the first, second and third working party code
tabulate wpcode, missing
tabulate wpcode2
* 1697 observations
tabulate wpcode3
* 36 observations

* Generate meeting id variable
* The id is the observation number after sorting. The stable option keeps tied
* records (identical date, slot, wpcode and wpname) in their existing relative order;
* without it Stata orders ties randomly, so the ids of tied records could differ
* from one run of this do-file to the next.
sort date slot wpcode wpname, stable
generate rno = _n
label variable rno "WP meeting number"
order rno

* Generate indicator variable for joint meetings
* A meeting is joint if it carries a second working party code
generate joint = (wpcode2 != "")
label variable joint "Joint meeting (yes/no)"
* The value label yesno is attached by name here; it is defined further down in
* this do-file (label def yesno), so the tabulation below still shows raw 0/1 codes
label values joint yesno
order joint, before(wpcode)
tabulate joint, missing

* Identify duplicates (same working party code, date and time slot)
* Duplicates are often the same meeting recorded in a different format,
* e.g. with third countries included (as in ESDP or JHA working groups),
* as a joint meeting with other working parties, or as a meeting in expert,
* attache, or friends of the presidency formation.
* Even where the formations really met in parallel rather than one after the other,
* they were likely talking about the same topics.
* Only meetings with a specific working party code are considered here:
* Z99, Z00 and all codes containing "00" are left out.
local coded `"!inlist(wpcode, "Z99", "Z00") & !regexm(wpcode, "00")"'
duplicates report wpcode date slot if `coded'
duplicates tag wpcode date slot if `coded', generate(dup)
tabulate wpcode dup, missing
* Within each code/date/slot group, put joint meetings first for inspection
gsort wpcode date slot -joint wpname_orig
list date slot wpcode wpcode2 wpcode3 joint wpname_orig if dup > 0 & !missing(dup)
list date slot wpcode wpname if dup > 0 & !missing(dup)

* Drop duplicates of groups that do not meet in the form of several formations
* Groups that do meet in several formations (and are therefore exempt):
* A10 Friends of the Presidency
* B09 Legal-linguistic experts
* F02 Financial Counsellors
* F06 Financial Services Attaches
* G37 JHA Counsellors
* K15 Competitiveness
* R06 International Environmental Issues

* Check whether these are really duplicates or whether they all meet in different formations
********************************************************************************************

* Tagged duplicates of all groups other than the multi-formation groups listed above
local singleform `"dup > 0 & !missing(dup) & !inlist(wpcode, "A10", "B09", "F02", "F06", "G37", "K15", "R06")"'

* Inspect first, then report, then drop all but the first record of each group
* (the first record is determined by the current sort order of the data)
list date slot wpcode wpcode2 wpcode3 joint wpname_orig if `singleform'
duplicates report wpcode date slot if `singleform'
duplicates drop wpcode date slot if `singleform', force
* 303 observations dropped 
drop dup

* Check remaining duplicates with working party coding
* (codes other than Z99, Z00 and those containing "00")
local coded `"!inlist(wpcode, "Z99", "Z00") & !regexm(wpcode, "00")"'
duplicates report wpcode date slot if `coded'
duplicates tag wpcode date slot if `coded', generate(dup)
tabulate wpcode dup if `coded', missing
gsort wpcode date slot -joint wpname_orig
tabulate wpcode if dup > 0 & !missing(dup)
list date slot wpcode wpcode2 wpcode3 joint dup wpname_orig if dup > 0 & !missing(dup)
drop dup
* 167 largely correct duplicate observations

* Check remaining duplicates with general policy coding only
* (codes containing "00", other than Z99 and Z00)
* They seem to be all correct
local generic `"!inlist(wpcode, "Z99", "Z00") & regexm(wpcode, "00")"'
duplicates report wpcode date slot if `generic'
duplicates tag wpcode date slot if `generic', generate(dup)
tabulate wpcode dup if `generic', missing
gsort wpcode date slot -joint wpname_orig
tabulate wpcode if dup > 0 & !missing(dup)
list date slot wpcode wpcode2 wpcode3 joint wpname_orig if dup > 0 & !missing(dup)
drop dup
* 72 correct duplicate observations
* 239 duplicate observations in total

* No duplicates for joint meetings
duplicates report wpcode date slot if joint == 1

* Give the main working party code a numeric suffix so that reshape can treat
* wpcode1, wpcode2 and wpcode3 as one stub
rename wpcode wpcode1


***************************************************************
* Create dataset with meeting-working party as unit of analysis
***************************************************************

* Reshape into long format: one observation per meeting (rno) and code position (wpcodeno)
reshape long wpcode, i(rno) j(wpcodeno)
label variable wpcodeno "WP name number"
label variable wpcode "WP code"
describe, short

* Keep only observations that actually carry a code
* (i.e. discard the 'empty' second and third code slots)
keep if wpcode != ""
describe, short
* 80222 observations

* Identify and drop duplicates with working party coding
* (codes other than Z99, Z00 and those containing "00")
local coded `"!inlist(wpcode, "Z99", "Z00") & !regexm(wpcode, "00")"'
local tagged "dup > 0 & !missing(dup)"
duplicates report wpcode date slot if `coded'
* 140 duplicate observations
duplicates tag wpcode date slot if `coded', generate(dup)
gsort wpcode date slot -joint wpname_orig
list date wpcodeno slot wpcode wpname_orig if `tagged'
list date wpcodeno slot wpcode wpname_orig if `tagged' & wpcodeno == 1
list date wpcodeno slot wpcode wpname_orig if `tagged' & wpcodeno > 1
* The new duplicates are almost all joint meetings in which one of its component working parties
* is recorded as a separate meeting.
* Drop the duplicate observation that records separately the meeting of a component group of a joint meeting

* Number the duplicate groups and record the highest code position found in each group;
* a maximum above 1 means that at least one record stems from a second or third code
egen dupgroup = group(wpcode date slot) if `tagged'
bysort dupgroup: egen wpcodeno_max = max(wpcodeno) if !missing(dupgroup)

* Put the record with the highest code position first within each group,
* because duplicates drop keeps the first record of a group
gsort wpcode date slot -wpcodeno
local jointdup "`tagged' & wpcodeno_max > 1 & !missing(wpcodeno_max)"
list date slot wpcodeno wpcodeno_max joint dupgroup wpcode wpname_orig if `jointdup'

* Exempt the groups that meet in several formations
local dropcond `"`jointdup' & !inlist(wpcode, "A10", "B09", "F02", "F06", "G37", "K15", "R06")"'
list date slot wpcodeno wpcodeno_max joint dupgroup wpcode wpname_orig if `dropcond'
duplicates report wpcode date slot if `dropcond'
duplicates drop wpcode date slot if `dropcond', force
* 52 duplicate observations dropped
drop dup dupgroup wpcodeno_max

* Check remaining number of duplicates among specific working party codes
*(should be the same number as before reshaping plus the number of groups with correct duplicates)
duplicates report wpcode date slot if !inlist(wpcode, "Z99", "Z00") & !regexm(wpcode, "00")
* 140 duplicates remaining

* Identify duplicates with general policy coding only
* (inspection only: nothing is dropped in this step)
local generic `"!inlist(wpcode, "Z99", "Z00") & regexm(wpcode, "00")"'
duplicates report wpcode date slot if `generic'
duplicates tag wpcode date slot if `generic', generate(dup)
sort date slot wpcode wpname_orig
tabulate wpcode if dup > 0 & !missing(dup)
list date wpcodeno slot wpcode wpname_orig if dup > 0 & !missing(dup)
drop dup
* 72 correct duplicates (no additional ones)


*************************************************************
* Collapse observations if they have meetings on the same day
*************************************************************

* Drop meetings that have no code at all
drop if inlist(wpcode, "Z00", "Z99")
* 7551 observations deleted

* Generate time slot dummy variables
* Each dummy is 1 for the slot codes that include the respective part of the day
tabulate slot, missing
generate am = inlist(slot, 1, 4, 6, 7)
label variable am "Morning (yes/no)"
generate pm = inlist(slot, 2, 4, 5, 7)
label variable pm "Afternoon (yes/no)"
generate ev = inlist(slot, 3, 5, 6, 7)
label variable ev "Evening (yes/no)"

* Define the yes/no value label, attach it to all three dummies,
* then cross-check every dummy against the slot variable
label define yesno 0 "No" 1 "Yes"
foreach daypart of varlist am pm ev {
	label values `daypart' yesno
}
foreach daypart of varlist am pm ev {
	tabulate slot `daypart', missing
}


* Merge complete duplicates (including identical original working party name)
*****************************************************************************
* Note: Only duplicates with half-day meetings are dropped.
* If both observations indicate a full-day meeting, they are more likely
* to refer to different meetings (especially juristes/linguistes meetings).
* The tag variable is always referenced by its full name (duplicate), so the
* checks do not depend on variable-name abbreviation being switched on.

* Drop observations with meeting on same day in the morning
gsort wpcode date -slot wpname_orig
duplicates report date wpcode wpname_orig am
duplicates tag date wpcode wpname_orig am, gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & duplicate != . & am == 1
* Within a tagged pair, drop the morning-only record in favour of the longer one.
* Given the slot coding used for am/pm/ev, a morning-only record has the lowest slot
* code among morning meetings and therefore sorts first within its pair. Requiring
* _n < _N always keeps the last record, so one observation survives even when both
* records of a pair are morning-only (previously both were deleted in that case).
bysort date wpcode wpname_orig am (slot): drop if duplicate == 1 & am == 1 & pm == 0 & ev == 0 & _n < _N
* 22 duplicates dropped (count recorded before the keep-one guard was added)
drop duplicate

* Drop observations with meeting on same day in the afternoon
gsort wpcode date -slot wpname_orig
duplicates report date wpcode wpname_orig pm
duplicates tag date wpcode wpname_orig pm, gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & duplicate != . & pm == 1
* Same rule as for the morning: an afternoon-only record has the lowest slot code
* among afternoon meetings, and the last record of each pair is always kept
bysort date wpcode wpname_orig pm (slot): drop if duplicate == 1 & am == 0 & pm == 1 & ev == 0 & _n < _N
* 17 duplicates dropped (count recorded before the keep-one guard was added)
drop duplicate

* Inspect observations with meeting on same day in the evening (nothing is dropped)
gsort wpcode date -slot wpname_orig
duplicates report date wpcode wpname_orig ev
duplicates tag date wpcode wpname_orig ev, gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & duplicate != . & ev == 1
drop duplicate


* Merge partial duplicates with identical working party name (after standardization)
************************************************************************************
* Note: Only duplicates with half-day meetings are dropped.
* If both observations indicate a full-day meeting, they are more likely
* to refer to different meetings (especially juristes/linguistes meetings).
* The tag variable is always referenced by its full name (duplicate), so the
* checks do not depend on variable-name abbreviation being switched on.

* Drop observations with meeting on same day in the morning
gsort wpcode date -slot wpname
duplicates report date wpcode wpname am
duplicates tag date wpcode wpname am, gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & duplicate != . & am == 1
* Within a tagged pair, drop the morning-only record in favour of the longer one.
* Given the slot coding used for am/pm/ev, a morning-only record has the lowest slot
* code among morning meetings and therefore sorts first within its pair. Requiring
* _n < _N always keeps the last record, so one observation survives even when both
* records of a pair are morning-only (previously both were deleted in that case).
bysort date wpcode wpname am (slot): drop if duplicate == 1 & am == 1 & pm == 0 & ev == 0 & _n < _N
* 34 observations dropped (count recorded before the keep-one guard was added)
drop duplicate

* Drop observations with meeting on same day in the afternoon
gsort wpcode date -slot wpname
duplicates report date wpcode wpname pm
duplicates tag date wpcode wpname pm, gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & duplicate != . & pm == 1
* Same rule as for the morning: an afternoon-only record has the lowest slot code
* among afternoon meetings, and the last record of each pair is always kept
bysort date wpcode wpname pm (slot): drop if duplicate == 1 & am == 0 & pm == 1 & ev == 0 & _n < _N
* 17 observations deleted (count recorded before the keep-one guard was added)
drop duplicate

* Inspect observations with meeting on same day in the evening (nothing is dropped)
gsort wpcode date -slot wpname
duplicates report date wpcode wpname ev
duplicates tag date wpcode wpname ev, gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & duplicate != . & ev == 1
drop duplicate


* Merge partial duplicates with identical working party code only
*****************************************************************
* The tag variable is always referenced by its full name (duplicate), so the
* checks do not depend on variable-name abbreviation being switched on.

* Drop observations with meeting on same day in the morning
************************************************************

* Observations with specific working group code
* duplicates drop keeps the first record of each group in the current sort order.
* rno and wpcodeno are appended as tie-breakers so that the surviving record is
* the same in every run; without them, records tied on the other keys are ordered
* arbitrarily by the sort.
gsort wpcode date -slot wpname rno wpcodeno
duplicates report date wpcode am if !regexm(wpcode, "00")
duplicates tag date wpcode am if !regexm(wpcode, "00"), gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & am == 1 ///
	& !regexm(wpcode, "00")

* Exempt the groups that meet in several formations
* (A10, B09, F02, F06, G37, K15, R06)
local singleform `"!inlist(wpcode, "A10", "B09", "F02", "F06", "G37", "K15", "R06")"'
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & am == 1 ///
	& `singleform'
* NOTE(review): unlike the listing above, the report and the drop below are not
* restricted to am == 1. Groups of non-morning meetings (am == 0) by the same
* working party on the same day are therefore collapsed here as well - confirm
* that this is intended before relying on the count below.
duplicates report date wpcode am if duplicate > 0 & !missing(duplicate) & `singleform'
duplicates drop date wpcode am if duplicate > 0 & !missing(duplicate) & `singleform', force
* 237 duplicates dropped
drop duplicate

* Observations with generic policy code only (inspection only: nothing is dropped)
gsort wpcode date -slot wpname
duplicates report date wpcode am if regexm(wpcode, "00")
duplicates tag date wpcode am if regexm(wpcode, "00"), gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & am == 1 ///
	& regexm(wpcode, "00")
* Mostly separate meetings
drop duplicate

* Check observations with meeting on same day in the afternoon
* (inspection only: no observations are dropped in the afternoon and evening checks)
* The tag variable is always referenced by its full name (duplicate), so the
* checks do not depend on variable-name abbreviation being switched on.

* Observations with specific working group code
* The duplicate key is date, working party code and day part only, in line with
* the morning check and with the checks for generic codes. Earlier, wpname was
* part of the key here, which merely repeated the name-based check.
gsort wpcode date -slot wpname
duplicates report date wpcode pm if !regexm(wpcode, "00")
duplicates tag date wpcode pm if !regexm(wpcode, "00"), gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & pm == 1 ///
	& !regexm(wpcode, "00")
* NOTE(review): the earlier annotation "No real duplicates" was recorded while
* wpname was still part of the key - re-check it against this listing.
drop duplicate

* Observations with generic policy code only
gsort wpcode date -slot wpname
duplicates report date wpcode pm if regexm(wpcode, "00")
duplicates tag date wpcode pm if regexm(wpcode, "00"), gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & pm == 1 ///
	& regexm(wpcode, "00")
* Mostly separate meetings
drop duplicate

* Check observations with meeting on same day in the evening

* Observations with specific working group code (key without wpname, as above)
gsort wpcode date -slot wpname
duplicates report date wpcode ev if !regexm(wpcode, "00")
duplicates tag date wpcode ev if !regexm(wpcode, "00"), gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & ev == 1 ///
	& !regexm(wpcode, "00")
drop duplicate

* Observations with generic policy code only
gsort wpcode date -slot wpname
duplicates report date wpcode ev if regexm(wpcode, "00")
duplicates tag date wpcode ev if regexm(wpcode, "00"), gen(duplicate)
list date slot wpcode wpname_orig ev am pm if duplicate > 0 & !missing(duplicate) & ev == 1 ///
	& regexm(wpcode, "00")
drop duplicate

* Generate meeting time variable
* Duration is the number of day parts (morning, afternoon, evening) a meeting covers
generate dur = am + pm + ev
label variable dur "Meeting duration"
tabulate dur, missing
tabulate slot dur, missing
order dur, before(year)


* Save data
* Sort chronologically, shrink storage types, and write the merged dataset
sort date slot wpcode
compress
describe, short
save "Data management\wpar-management06-workingparties03-merge.dta", replace

* Close the log and leave the do-file
log close
exit
